Analysis of Pont-to-point experiments of MPI calls

If needed, you should install the right packages (plyr, ggplot2, and knitr) with the install.packages command.

#Impossible to use require with a functionparameter, so no function to avoid copying this n times
library("plyr")
library("ggplot2")
library("XML")

library(tidyr)
library(ggplot2)
library(dplyr)
library(rhdf5)
library(plotly)

library(tidyverse)
library(lubridate)
library(viridis)

Load XML config file and .csv resulting files from the MPI execution

config <- xmlParse(params$input)
prefix <- sapply(getNodeSet(config, "//prefix"), function(el) xmlGetAttr(el, "value"))
minSize <- sapply(getNodeSet(config, "//minSize"), function(el) xmlGetAttr(el, "value"))
maxSize <- sapply(getNodeSet(config, "//maxSize"), function(el) xmlGetAttr(el, "value"))
iterations <-  as.integer(sapply(getNodeSet(config, "//iterations"), function(el) xmlGetAttr(el, "value")))
breakpoints_file <- sapply(getNodeSet(config, "//breakpoints_file"), function(el) xmlGetAttr(el, "value"))
outliers_threshold <-  as.numeric(sapply(getNodeSet(config, "//outliers_threshold"), function(el) xmlGetAttr(el, "value")))

eager_threshold <- as.integer(sapply(getNodeSet(config, "//eager_threshold"), function(el) xmlGetAttr(el, "value")))
detached_threshold <- as.integer(sapply(getNodeSet(config, "//detached_threshold"), function(el) xmlGetAttr(el, "value")))

expected_bandwidth <- as.double(sapply(getNodeSet(config, "//expected_bandwidth"), function(el) xmlGetAttr(el, "value")))
expected_latency <- as.double(sapply(getNodeSet(config, "//expected_latency"), function(el) xmlGetAttr(el, "value")))


read_csv <- function(file) {
  df <- read.csv(file, header=FALSE, strip.white=TRUE)
  names(df) <- c("Op","Size","Start", "Duration")
  df$Origin=file
  df
}

remove_outliers <-function(duration, thres=outliers_threshold) {
  qnt <- quantile(duration, probs=c(thres, 1-thres))
  outliers <- ifelse((duration<qnt[1] | duration>qnt[2]), 1,0)
  outliers
}


df_pingpong <- read_csv(paste0(prefix,"_PingPong.csv"))
df_send <-  df_pingpong[df_pingpong$Op=="MPI_Send",]
df_recvs <- df_pingpong[df_pingpong$Op=="MPI_Recv",]


df_pingpong <- data.frame( Op = unique("PingPong"),
                   Size = df_send$Size,
                   Start = df_send$Start,
                   Duration = df_recvs$Duration + df_send$Duration,
                   Origin = unique(df_send$Origin))


df_isend <- read_csv(paste0(prefix,"_Isend.csv"))
df_recv <- read_csv(paste0(prefix,"_Recv.csv"))
df_wtime <- read_csv(paste0(prefix,"_Wtime.csv"))
df_test <- read_csv(paste0(prefix,"_Test.csv"))
df_iprobe <- read_csv(paste0(prefix,"_Iprobe.csv"))


#remove outliers
df_send$experiment <-floor((as.numeric(1:nrow(df_send))-1)/10)
df_send <- ddply(df_send, .(experiment), transform, outliers=remove_outliers(Duration))
df_send <-df_send [df_send$outliers==0,]

df_isend$experiment <-floor((as.numeric(1:nrow(df_isend))-1)/10)
df_isend <- ddply(df_isend, .(experiment), transform, outliers=remove_outliers(Duration))
df_isend <-df_isend [df_isend$outliers==0,]

df_recv$experiment <-floor((as.numeric(1:nrow(df_recv))-1)/10)
df_recv <- ddply(df_recv, .(experiment), transform, outliers=remove_outliers(Duration))

df_recv <-df_recv [df_recv$outliers==0,]

#read breakpoint file
options(stringsAsFactors = FALSE)

bp <- read.csv(breakpoints_file, header=TRUE, strip.white=TRUE)
vec<-list(.Machine$integer.max, "Large")
bp <- rbind(bp, vec)
v <- bp$Limit
   bp$LimitInf <- c(0,v[1:length(v)-1])
   bp$Name <- as.character(bp$Name)

#Analysis Functions

 classify_messages <- function(d) {
    d$group=as.character("outliers")
    for(i in (1:length(bp$Limit))) {
        d[d$Size < bp[i,]$Limit & d$Size >= bp[i,]$LimitInf,]$group = bp[i,]$Name
    }
    d
  }

  comm_function <- function(x,zone) {
        bp[x>=bp$LimitInf & x<bp$Limit,paste0(zone,"Intercept")] +
    x * bp[x>=bp$LimitInf & x<bp$Limit,paste0(zone,"Size")]
  }


  fancy_plot <- function(d,zone, title) {
    d$values_test <- sapply(d$Size,function(x) {comm_function(x,zone)})

    pl <- ggplot(data=d, aes(x=Size,y=Duration,color=group))
    pl <- pl + geom_vline(xintercept=bp$Limit, colour="gray", linetype = "longdash")  +  annotate("text", x=bp$Limit, y=seq(.1*max(d$Duration),2*max(d$Duration),length.out=length(bp$Limit)), label=bp$Name)
    pl <- pl + geom_point(alpha=.3)
    pl <- pl + geom_line(aes(x=Size,y=values_test), colour="black")
    pl <- pl + scale_x_log10(limits=c(1, max(d$Size))) + scale_y_log10()
    pl <- pl + ylab(paste0(title, " duration (seconds)")) + xlab("Message size (bytes)") + theme_bw() + scale_alpha(guide = "none")

    # ggsave(filename=paste("/tmp/", prefix,"_",title, ".png", sep=""),plot=last_plot())
    return(pl)
  }
  compute_lm <- function(bp, d, zone,remove_sync=FALSE,remove_detached=FALSE) {
    if(remove_sync) {
       group_list = bp[bp$Limit<=detached_threshold,]$Name
    } else {
       group_list = bp$Name
    }
    if(remove_detached) {
       group_list = bp[bp$Limit<=detached_threshold,]$Name
    }
    mylm <- ddply(d[d$group %in% (group_list),],
                   c("group"), function(x) {
                model <- lm(Duration ~ Size, data = x)
                coef(model)
                })
    intercept_label = paste0(zone,"Intercept")
    size_label = paste0(zone,"Size")
    bp[,intercept_label] = 0
    bp[,size_label] = 0

    for(i in (mylm$group)) {
       bp[bp$Name == i,intercept_label] = mylm[mylm$group == i,"(Intercept)"]
       bp[bp$Name == i,size_label] = mylm[mylm$group == i,"Size"]
    }
    bp
  }

  display_lm <- function(d,remove_sync=FALSE,remove_detached=FALSE) {
    if(remove_sync) {
       group_list = bp[bp$Limit<=detached_threshold,]$Name
    } else {
       group_list = bp$Name
    }
    if(remove_detached) {
       group_list = bp[bp$Limit<detached_threshold,]$Name
    }
    for(i in (1:length(group_list))) {
       print(paste0("----- ",bp[i,]$Name,"-----"))
       lim <- bp[i,]$Limit
       print(summary(lm(Duration ~ Size, data = d[d$group==bp[bp$Limit==lim,]$Name,])))
    }
  }


#Plotting functions

  comm_function <- function(x,zone) {
        bp[x>=bp$LimitInf & x<bp$Limit,paste0(zone,"Intercept")] +
    x * bp[x>=bp$LimitInf & x<bp$Limit,paste0(zone,"Size")]
  }


  fancy_plot <- function(d,zone, title) {
    d$values_test <- sapply(d$Size,function(x) {comm_function(x,zone)})

    pl <- ggplot(data=d, aes(x=Size,y=Duration,color=group))
    pl <- pl + geom_vline(xintercept=bp$Limit, colour="gray", linetype = "longdash")  +  annotate("text", x=bp$Limit, y=seq(.1*max(d$Duration),2*max(d$Duration),length.out=length(bp$Limit)), label=bp$Name)
    pl <- pl + geom_point(alpha=.3)
    pl <- pl + geom_line(aes(x=Size,y=values_test), colour="black")
    pl <- pl + scale_x_log10(limits=c(1, max(d$Size))) + scale_y_log10()
    pl <- pl + ylab(paste0(title, " duration (seconds)")) + xlab("Message size (bytes)") + theme_bw() + scale_alpha(guide = "none")

    # ggsave(filename=paste("/tmp/", prefix,"_",title, ".png", sep=""),plot=last_plot())
    return(pl)
  }

MPI_Send timing

Timings for this experiment are taken from a ping-pong experiment, used to determine os.

We determine the piecewiese regression based on information taken from the regression file pointed in the XML configuration file

print(bp)
##        Limit  Name LimitInf
## 1      15000 Small        0
## 2 2147483647 Large    15000
df_send <- classify_messages(df_send)
bp <- compute_lm(bp,df_send,"Os",remove_sync=TRUE)

Display the regression factors to help tuning.

display_lm(df_send,remove_sync=TRUE)
## [1] "----- Small-----"
## 
## Call:
## lm(formula = Duration ~ Size, data = d[d$group == bp[bp$Limit == 
##     lim, ]$Name, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.150e-06 -4.486e-08  1.234e-08  8.321e-08  1.121e-06 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.875e-07  1.596e-09   430.7   <2e-16 ***
## Size        1.633e-10  6.207e-13   263.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.787e-07 on 17745 degrees of freedom
## Multiple R-squared:  0.796,  Adjusted R-squared:  0.796 
## F-statistic: 6.922e+04 on 1 and 17745 DF,  p-value: < 2.2e-16

Visual representation of the computed data, to visualize correctness of the computed value.

The black line representing the regression should be very close to the values, and should drop to 0 when communications use the rendez-vous algorithm (Large messages, with a size > eager_threshold).

If they are not, tune the breakpoints in order to match more closely to your implementation. Thresholds for eager and detached messages depend on the library and the hardware used. Consult the documentation of your library on how to display this information if you can’t visually determine it (For Ethernet network we saw values of 65536, while IB networks had values of 12288 or 17408 depending on the implementation)

fancy_plot(df_send[df_send$group!="outliers",],"Os", "MPI_Send")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_text).

MPI_Isend timing —————

As they may differ from Send times, check this and call it ois, to inject proper timings later.

df_isend <- classify_messages(df_isend)
# We want to inject timings in all MPI_Isend, even the large ones. Hence remove_sync=F
bp <- compute_lm(bp,df_isend,"Ois",remove_sync=F)

Display the regression factors to help tuning

display_lm(df_isend,remove_sync=T)
## [1] "----- Small-----"
## 
## Call:
## lm(formula = Duration ~ Size, data = d[d$group == bp[bp$Limit == 
##     lim, ]$Name, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.085e-06 -6.830e-08  4.700e-09  8.750e-08  1.196e-05 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.898e-07  2.096e-09   329.1   <2e-16 ***
## Size        1.635e-10  8.152e-13   200.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.346e-07 on 17745 degrees of freedom
## Multiple R-squared:  0.6939, Adjusted R-squared:  0.6939 
## F-statistic: 4.024e+04 on 1 and 17745 DF,  p-value: < 2.2e-16

Visual representation of the computed data, to visualize correctness of the computed value

fancy_plot(df_isend[df_isend$group!="outliers",],"Ois", "MPI_Isend")
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_text).

MPI_Recv timing

Timings are used to determine or. This experiment waits for a potentially eager message to arrive before launching the recv for small message size, eliminating waiting times as much as possible.

df_recv <- classify_messages(df_recv)
bp <- compute_lm(bp,df_recv,"Or",remove_detached=TRUE)

Display the regression factors to help tuning

display_lm(df_recv,remove_detached=TRUE)
## [1] "----- Small-----"
## 
## Call:
## lm(formula = Duration ~ Size, data = d[d$group == bp[bp$Limit == 
##     lim, ]$Name, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.106e-06 -5.100e-08  1.700e-09  6.450e-08  1.301e-05 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.945e-06  4.154e-09   468.1   <2e-16 ***
## Size        1.994e-10  1.614e-12   123.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.642e-07 on 17688 degrees of freedom
## Multiple R-squared:  0.4632, Adjusted R-squared:  0.4632 
## F-statistic: 1.527e+04 on 1 and 17688 DF,  p-value: < 2.2e-16

Visual representation of the computed data, to visualize correctness of the computed value

ggplotly(fancy_plot(df_recv[df_recv$group!="outliers",],"Or", "MPI_Recv"))
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Transformation introduced infinite values in continuous y-axis
fancy_plot(df_recv[df_recv$group!="outliers",],"Or", "MPI_Recv")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_text).

Pingpong timing

pingpong = 2or+2transfer for small messages that are sent asynchronously. For large sizes, communications are synchronous, hence we have pingpong = 2transfer.

df_pingpong <- classify_messages(df_pingpong)
bp <- compute_lm(bp,df_pingpong,"PingPong")
display_lm(df_pingpong)
## [1] "----- Small-----"
## 
## Call:
## lm(formula = Duration ~ Size, data = d[d$group == bp[bp$Limit == 
##     lim, ]$Name, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.672e-06 -8.739e-07  2.699e-07  7.184e-07  1.661e-05 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.131e-06  8.346e-09   614.8   <2e-16 ***
## Size        1.210e-09  3.240e-12   373.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.04e-06 on 21998 degrees of freedom
## Multiple R-squared:  0.8637, Adjusted R-squared:  0.8637 
## F-statistic: 1.394e+05 on 1 and 21998 DF,  p-value: < 2.2e-16
## 
## [1] "----- Large-----"
## 
## Call:
## lm(formula = Duration ~ Size, data = d[d$group == bp[bp$Limit == 
##     lim, ]$Name, ])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.779e-05 -4.800e-07  5.780e-07  2.986e-06  8.357e-05 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.446e-05  1.027e-07   140.8   <2e-16 ***
## Size        3.816e-10  2.451e-13  1556.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.699e-06 on 11998 degrees of freedom
## Multiple R-squared:  0.9951, Adjusted R-squared:  0.9951 
## F-statistic: 2.424e+06 on 1 and 11998 DF,  p-value: < 2.2e-16
fancy_plot(df_pingpong[df_pingpong$group!="outliers",],"PingPong","PingPong")
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_text).

bp$DelayIntercept <- (bp$PingPongIntercept-bp$OrIntercept)/2
bp$DelaySize <- (bp$PingPongSize-bp$OrSize)/2

df_pingpong$values_test2 <- sapply(df_pingpong$Size,
    function(x) {comm_function(x,"Delay")})

df_pingpong$Duration2 = (df_pingpong$Duration -
   sapply(df_pingpong$Size,function(x) {comm_function(x,"Or")}))/2

pl <- ggplot(data=df_pingpong, aes(x=Size,y=Duration2,color=group))+geom_point(alpha=.2)
pl + geom_line(aes(x=Size,y=values_test2), colour="black") +scale_x_log10()+scale_y_log10()

Print results in Simgrid’s xml format

# Check that values are in (0, 1]
test_unit_interval <- function(value) {
  ifelse(value > 1, 1, value)
}

# Check that values are in [1, inf)
test_lat_interval <- function(value) {
  ifelse(value < 1, 1, value)
}

desc=data.frame()
desc=rbind(desc, ddply(bp,c("Name"), summarize, Name="smpi/os",Limit=Limit, desc=paste0(LimitInf,":",OsIntercept,":",OsSize)))
desc=rbind(desc, ddply(bp,c("Name"), summarize, Name="smpi/or",Limit=Limit, desc=paste0(LimitInf,":",OrIntercept,":",OrSize)))
desc=rbind(desc, ddply(bp,c("Name"), summarize, Name="smpi/ois",Limit=Limit, desc=paste0(LimitInf,":",OisIntercept,":",OisSize)))
desc=rbind(desc, ddply(bp,c("Name"), summarize, Name="smpi/bw-factor",Limit=Limit, desc=paste0(LimitInf,":",test_unit_interval(1/(DelaySize*expected_bandwidth)))))
desc=rbind(desc, ddply(bp,c("Name"), summarize, Name="smpi/lat-factor",Limit=Limit, desc=paste0(LimitInf,":",test_lat_interval(DelayIntercept/expected_latency))))

desc <- desc[with(desc, order(Name,Limit)),]
desc <- ddply(desc,c("Name"),summarize,label=paste(desc,collapse = ';'))

as <- xmlNode("prop", attrs = c(id="smpi/async-small-thres", value=eager_threshold))
de <- xmlNode("prop", attrs = c(id="smpi/send-is-detached-thres", value= detached_threshold))
bw <- xmlNode("prop", attrs = c(id="smpi/bw-factor", value=desc[desc$Name=="smpi/bw-factor",]$label))
lat <- xmlNode("prop", attrs = c(id="smpi/lat-factor", value=desc[desc$Name=="smpi/lat-factor",]$label))
os <- xmlNode("prop", attrs = c(id="smpi/os", value=desc[desc$Name=="smpi/os",]$label))
or <- xmlNode("prop", attrs = c(id="smpi/or", value=desc[desc$Name=="smpi/or",]$label))
ois <- xmlNode("prop", attrs = c(id="smpi/ois", value=desc[desc$Name=="smpi/ois",]$label))

a <- xmlNode("config", attrs = c(id="General"), os, ois, or, bw, lat, as, de)

MPI_Wtime timing

We made a run with 10 millions calls to MPI_Wtime and we want to know the time of one call

wtime=(df_wtime$Duration)/10000000

Time for one MPI_Wtime call

a <- addChildren(a, xmlNode("prop", attrs= c(id="smpi/wtime", value=paste0(wtime))))

MPI_Iprobe timing

We made 1000 runs of pingpong with pollling on MPI_Iprobe. Compute the Duration of such a call, and check whether its time is related to the size of the message

qplot(data=df_iprobe,y=Duration)

iprobe=mean(df_iprobe$Duration, trim=0.1)

Time for one MPI_Iprobe call

# print(iprobe)

a <- addChildren(a, xmlNode("prop", attrs= c(id="smpi/iprobe", value=paste0(iprobe))))

MPI_Test timing

qplot(data=df_test,y=Duration)

test=mean(df_test$Duration, trim=0.1)

Time for one MPI_Test call

# print(test)

a <- addChildren(a, xmlNode("prop", attrs= c(id="smpi/test", value=paste0(test))))

Result of calibration.

The following snippet of XML has to be included at the beginning of your platformfile. Please report to the SimGrid mailing list any bug with the calibration or the generated platform file.

# print(a)

print(paste0("Results written in ", paste0(prefix,".xml")))
## [1] "Results written in frog.xml"
saveXML(a, file=paste0(prefix, paste0("_output", ".xml")))
## [1] "frog_output.xml"